In [288]:
# Regression. Numeric and Categorical Predictors. Dummy Variables and Interactions.
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Set working directory and load data
os.chdir("C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data") # Change the working directory
Auto = pd.read_csv("Auto.csv") # Read the data file in the CSV format
In [290]:
# Prepare X and Y, fit a linear regression model, and plot the regression line with data
Weight = Auto['weight']
MPG = Auto['mpg']
X = sm.add_constant(Weight)
reg = sm.OLS( MPG, X ).fit()
print(reg.summary())
plt.scatter(Weight, MPG, label='Data', s=15)
plt.plot(Weight, reg.predict(X), color='red', label='Regression Line')
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon'); plt.title('Linear regression line');
plt.legend(); plt.show()
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.692
Model:                            OLS   Adj. R-squared:                  0.691
Method:                 Least Squares   F-statistic:                     886.6
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          5.37e-103
Time:                        10:29:59   Log-Likelihood:                -1146.0
No. Observations:                 397   AIC:                             2296.
Df Residuals:                     395   BIC:                             2304.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         46.3174      0.796     58.166      0.000      44.752      47.883
weight        -0.0077      0.000    -29.776      0.000      -0.008      -0.007
==============================================================================
Omnibus:                       40.133   Durbin-Watson:                   0.797
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               56.057
Skew:                           0.712   Prob(JB):                     6.72e-13
Kurtosis:                       4.166   Cond. No.                     1.13e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
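In [ ]:
# A minimal illustration (a sketch, not part of the fitted output above): use the
# estimated coefficients to predict MPG for a hypothetical 3,000 lb car.
b0, b1 = reg.params['const'], reg.params['weight']
print(f"Predicted MPG for a 3000 lb car: {b0 + b1 * 3000:.1f}")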
In [292]:
# Is it the same linear relationship for American, Asian, and European cars?
# Map colors according to 'origin' and plot weight vs mpg colored by the origin
Auto['color'] = Auto['origin'].map({1: 'orange', 2: 'blue', 3: 'green'}) # Replace the color names with your desired colors
plt.scatter(Auto['weight'], Auto['mpg'], c=Auto['color'])
plt.xlabel('Weight'); plt.ylabel('MPG')
plt.title('Weight vs MPG colored by Continent')
plt.show()
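In [ ]:
# A sketch adding a legend, since the colors alone do not identify the groups.
# The region names assume the usual coding of the ISLR Auto data
# (1 = American, 2 = European, 3 = Japanese).
for code, name, col in [(1, 'American', 'orange'), (2, 'European', 'blue'), (3, 'Japanese', 'green')]:
    subset = Auto[Auto['origin'] == code]
    plt.scatter(subset['weight'], subset['mpg'], c=col, s=15, label=name)
plt.xlabel('Weight'); plt.ylabel('MPG'); plt.title('Weight vs MPG colored by Origin')
plt.legend(); plt.show()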
In [294]:
# The relationship becomes less steep for heavier cars.
In [320]:
# Fit a linear regression model with dummy variables, allowing different INTERCEPTS
DummyColumns = pd.get_dummies(Auto['origin'], dtype=int, drop_first=True) # Dummy variables for origins 2 and 3; origin 1 is the baseline
Dummies = DummyColumns.rename(columns={2: 'origin2', 3: 'origin3'})
X = sm.add_constant(Dummies)
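In [ ]:
# Quick check (a sketch): inspect the design matrix to confirm the dummy coding;
# origin 1 is the dropped baseline level, absorbed into the intercept.
print(X.head())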
In [322]:
reg_dummies = sm.OLS(MPG, X).fit()
print(reg_dummies.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.333
Model:                            OLS   Adj. R-squared:                  0.330
Method:                 Least Squares   F-statistic:                     98.45
Date:                Wed, 31 Jul 2024   Prob (F-statistic):           2.12e-35
Time:                        10:40:18   Log-Likelihood:                -1299.2
No. Observations:                 397   AIC:                             2604.
Df Residuals:                     394   BIC:                             2616.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         20.0718      0.407     49.339      0.000      19.272      20.872
origin2        7.8197      0.867      9.018      0.000       6.115       9.524
origin3       10.3789      0.828     12.540      0.000       8.752      12.006
==============================================================================
Omnibus:                       25.088   Durbin-Watson:                   0.753
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               28.611
Skew:                           0.657   Prob(JB):                     6.13e-07
Kurtosis:                       3.020   Cond. No.                         3.16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
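In [ ]:
# Sanity check (a sketch): with dummies only, the fitted values are the per-origin
# mean MPG, so the group means should equal const, const + origin2, and const + origin3.
print(Auto.groupby('origin')['mpg'].mean())
print(reg_dummies.params)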
In [324]:
# Plot the resulting regression lines in different colors
plt.scatter(Weight, MPG, c=Auto['color'], s=20)
plt.scatter(Weight, reg_dummies.predict(X), c=Auto['color'])
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon');
plt.title('Regression with different intercepts only (no slope)');
plt.show()
In [342]:
# Include a common slope.
X1 = pd.concat([X, Weight], axis=1)
reg_int_oneslope = sm.OLS(MPG,X1).fit()
print(reg_int_oneslope.summary())
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.702
Model:                            OLS   Adj. R-squared:                  0.699
Method:                 Least Squares   F-statistic:                     307.9
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          8.82e-103
Time:                        10:47:43   Log-Likelihood:                -1139.6
No. Observations:                 397   AIC:                             2287.
Df Residuals:                     393   BIC:                             2303.
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         43.6896      1.107     39.481      0.000      41.514      45.865
origin2        1.2190      0.654      1.865      0.063      -0.066       2.504
origin3        2.3592      0.663      3.556      0.000       1.055       3.663
weight        -0.0070      0.000    -22.021      0.000      -0.008      -0.006
==============================================================================
Omnibus:                       37.597   Durbin-Watson:                   0.813
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               54.086
Skew:                           0.662   Prob(JB):                     1.80e-12
Kurtosis:                       4.232   Cond. No.                     1.82e+04
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.82e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
In [346]:
# This regression has a slightly higher R^2 and adjusted R^2 than the weight-only model (0.702 vs. 0.692).
# The common slope and origin3 are significant; origin2 is only marginal (p = 0.063). Plot:
plt.scatter(Weight, MPG, c=Auto['color'], s=20)
plt.scatter(Weight, reg_int_oneslope.predict(X1), c=Auto['color'])
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon');
plt.title('Regression with different intercepts and one common slope');
plt.show()
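In [ ]:
# Alternative plot (a sketch): draw the three parallel fitted lines over a weight grid
# instead of scattering the predictions; assumes the coefficient names in reg_int_oneslope.
grid = np.linspace(Weight.min(), Weight.max(), 100)
b = reg_int_oneslope.params
for o2, o3, col in [(0, 0, 'orange'), (1, 0, 'blue'), (0, 1, 'green')]:
    plt.plot(grid, b['const'] + b['origin2'] * o2 + b['origin3'] * o3 + b['weight'] * grid, color=col)
plt.scatter(Weight, MPG, c=Auto['color'], s=15)
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon')
plt.title('Parallel fitted lines by origin'); plt.show()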
In [352]:
# Fit a linear regression model with interaction terms, allowing different slopes for different continents
X1['origin2_weight'] = X1['origin2']*X1['weight']
X1['origin3_weight'] = X1['origin3']*X1['weight']
reg_interactions = sm.OLS(MPG,X1).fit()
reg_interactions.summary()
Out[352]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                    mpg   R-squared:                       0.706
Model:                            OLS   Adj. R-squared:                  0.703
Method:                 Least Squares   F-statistic:                     188.1
Date:                Wed, 31 Jul 2024   Prob (F-statistic):          1.14e-101
Time:                        10:54:14   Log-Likelihood:                -1136.4
No. Observations:                 397   AIC:                             2285.
Df Residuals:                     391   BIC:                             2309.
Df Model:                           5
Covariance Type:            nonrobust
==================================================================================
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             42.9846      1.179     36.465      0.000      40.667      45.302
origin2            2.3912      2.847      0.840      0.401      -3.206       7.988
origin3           11.2755      3.583      3.147      0.002       4.231      18.320
weight            -0.0068      0.000    -19.973      0.000      -0.007      -0.006
origin2_weight    -0.0004      0.001     -0.365      0.715      -0.003       0.002
origin3_weight    -0.0039      0.002     -2.527      0.012      -0.007      -0.001
==================================================================================
Omnibus:                       42.084   Durbin-Watson:                   0.819
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               61.346
Skew:                           0.720   Prob(JB):                     4.78e-14
Kurtosis:                       4.278   Cond. No.                     5.36e+04
==================================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.36e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
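In [ ]:
# Equivalent fit (a sketch) using the statsmodels formula interface, which builds the
# dummies and interaction terms automatically; coefficient names differ, but the
# estimates should match reg_interactions above.
import statsmodels.formula.api as smf
reg_formula = smf.ols('mpg ~ weight * C(origin)', data=Auto).fit()
print(reg_formula.summary())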
In [354]:
# Adding the interactions improves R^2 and adjusted R^2 only slightly (0.706 vs. 0.702).
# The origin3-weight interaction is significant (p = 0.012), but the origin2-weight interaction is not (p = 0.715). Plot:
plt.scatter(Weight, MPG, c=Auto['color'], s=20)
plt.scatter(Weight, reg_interactions.predict(X1), c=Auto['color'])
plt.xlabel('Automobile weight'); plt.ylabel('Miles per gallon');
plt.title('Regression with origin-weight interactions');
plt.show()
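In [ ]:
# Closing check (a sketch): a partial F-test comparing the nested models, asking whether
# the two interaction terms add explanatory power beyond the common-slope model.
from statsmodels.stats.anova import anova_lm
print(anova_lm(reg_int_oneslope, reg_interactions))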